
#include <assert.h>
#include <cublas_v2.h>
#include <iomanip>
#include <iostream>
#include <limits>
#include <random>
#include<algorithm>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/gather.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/random.h>
#include <thrust/scan.h>
#include <vector>

/**********************/
/* cuBLAS ERROR CHECK */
/**********************/
#ifndef cublasSafeCall
#define cublasSafeCall(err) __cublasSafeCall(err, __FILE__, __LINE__)
#endif

inline void __cublasSafeCall(cublasStatus_t err, const char *file,
                             const int line) {
  if (CUBLAS_STATUS_SUCCESS != err) {
    fprintf(stderr,
            "CUBLAS error in file '%s', line %d\n \nerror %d \nterminating!\n",
            __FILE__, __LINE__, err);
    // getch();
    cudaDeviceReset();
    assert(0);
  }
}

// convert a linear index to a linear index in the transpose
struct transpose_index : public thrust::unary_function<size_t, size_t> {
  size_t m, n;

  __host__ __device__ transpose_index(size_t _m, size_t _n) : m(_m), n(_n) {}

  __host__ __device__ size_t operator()(size_t linear_index) {
    size_t i = linear_index / n;
    size_t j = linear_index % n;

    return m * j + i;
  }
};

// convert a linear index to a row index
struct row_index : public thrust::unary_function<size_t, size_t> {
  size_t n;

  __host__ __device__ row_index(size_t _n) : n(_n) {}

  __host__ __device__

      size_t
      operator()(size_t i) {
    return i / n;
  }
};

// transpose an M-by-N array
template <typename T>
void transpose(size_t m, size_t n, thrust::device_vector<T> &src,
               thrust::device_vector<T> &dst) {
  thrust::counting_iterator<size_t> indices(0);

  thrust::gather(
      thrust::make_transform_iterator(indices, transpose_index(n, m)),
      thrust::make_transform_iterator(indices, transpose_index(n, m)) +
          dst.size(),
      src.begin(), dst.begin());
}

// share memory transpose
template <typename T>
__global__ void __launch_bounds__(1024)
    transpose_kernel(const T *src, T *dst, int dstM, int dstN) {
  __shared__ T share_arrary[32][33];
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;

  for (int block_offest_y = blockIdx.y * blockDim.y; block_offest_y < dstM;
       block_offest_y += blockDim.y * gridDim.y) {
    for (int block_offest_x = blockIdx.x * blockDim.x; block_offest_x < dstN;
         block_offest_x += blockDim.x * gridDim.x) {
      // src coordinate
      int src_col = block_offest_y + tx;
      int src_row = block_offest_x + ty;

      if (src_col < dstM && src_row < dstN) {
        share_arrary[ty][tx] = src[src_row * dstM + src_col]; //合并访存
      }
      __syncthreads();
      // dst coordinate
      // Block thread的坐标映射是根据 dst来着
      int dst_row = block_offest_y + ty;
      int dst_col = block_offest_x + tx;
      if (dst_row < dstM && dst_col < dstN) {
        dst[dst_row * dstN + dst_col] = share_arrary[tx][ty];
      }
    }
  }
}
// print an M-by-N array
template <typename T>
void print(size_t m, size_t n, thrust::device_vector<T> &d_data) {
  thrust::host_vector<T> h_data = d_data;

  for (size_t i = 0; i < m; i++) {
    for (size_t j = 0; j < n; j++)
      std::cout << std::setw(1) << h_data[i * n + j] << " ";
    std::cout << "\n";
  }
}

template <typename T>
bool verify_res(size_t m, size_t n, const thrust::device_vector<T> &ref_data,
                const thrust::device_vector<T> &res_data,
                T abs_error = T(1e-4)) {
  thrust::host_vector<T> href_data = ref_data;
  thrust::host_vector<T> hres_data = res_data;

  T max_error = std::numeric_limits<T>::lowest();
  int num_errors = 0;
  for (size_t i = 0; i < m; i++) {
    for (size_t j = 0; j < n; j++) {
      auto tmp_error = std::abs(hres_data[i * n + j] - href_data[i * n + j]);
      // std::cout<<tmp_error<<"\n";
      if (tmp_error > abs_error)
        num_errors++;
      max_error = max_error > tmp_error ? max_error : tmp_error;
    }
  }
  std::cout << "num_error: " << num_errors << " max error= " << max_error
            << " \n";
  return num_errors == 0;
}
// nvcc transpose.cu -O2 --extended-lambda  -l cublas -o transpose_sample
int main(void) {
  size_t m = 2048; // number of rows
  size_t n = 512; // number of columns

  // 2d array stored in row-major order [(0,0), (0,1), (0,2) ... ]
  std::vector<double> h_data(m * n);

  std::random_device rd;  // 将用于获得随机数引擎的种子
  std::mt19937 gen(rd()); // 以 rd() 播种的标准 mersenne_twister_engine
  std::uniform_real_distribution<double> dis(1, 10);
  std::generate(h_data.begin(), h_data.end(),[&rd,&gen,&dis](){return dis(gen);});
  thrust::device_vector<double> data=h_data;
  /*
    data[1] = 2.;
    data[3] = 3.;
    data[m * n - 1] = 8.;

    thrust::generate(data.begin(), data.end(), [] __device__() {
      thrust::default_random_engine random_engine;

      thrust::uniform_real_distribution<double> dist(0.1, 10.);

      return dist(random_engine);
    });
    */
  std::cout << "Initial array" << std::endl;
  // print(m, n, data);

  std::cout << "Transpose array - Thrust" << std::endl;
  thrust::device_vector<double> transposed_thrust(m * n);
  transpose(m, n, data, transposed_thrust);
  // print(n, m, transposed_thrust);

  std::cout << "Transpose array - cuBLAS" << std::endl;
  thrust::device_vector<double> transposed_cuBLAS(m * n);
  double *dv_ptr_in = thrust::raw_pointer_cast(data.data());
  double *dv_ptr_out = thrust::raw_pointer_cast(transposed_cuBLAS.data());
  double alpha = 1.;
  double beta = 0.;
  cublasHandle_t handle;
  cublasSafeCall(cublasCreate(&handle));
  cublasSafeCall(cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, &alpha,
                             dv_ptr_in, n, &beta, dv_ptr_in, n, dv_ptr_out, m));
  // print(n, m, transposed_cuBLAS);
  /*
cudaStream_t stream0;
  cudaStreamCreate(&stream0);

  cublasSetStream_v2(handle, stream0);
*/
  // getch();
  std::cout << "Transpose array - kernel" << std::endl;

  thrust::device_vector<double> transposed_kernel_data(m * n);
  double *dv_ptr_kernel_out =
      thrust::raw_pointer_cast(transposed_kernel_data.data());
  const dim3 block(32, 32);
  const dim3 grid((n+31)/32,(m+31)/32);
  //const dim3 grid(2, 3);

  transpose_kernel<<<grid, block>>>(dv_ptr_in, dv_ptr_kernel_out, n, m);
  verify_res(n, m, transposed_thrust, transposed_kernel_data);
  //   cudaDeviceSynchronize();
  // 
  // print(n, m, transposed_kernel_data);

  return 0;
}